{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7b91d206-39f3-4064-8692-d29d59c86303",
   "metadata": {},
   "source": [
    "## Vector Database setup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "346ea881-5faa-4d7f-b317-3b51a3d2bc15",
   "metadata": {},
   "source": [
    "Remove old Weaviate DB files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "010cae0d-c7c6-4cf0-aab5-e1b149d27be6",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Destructive: recursively deletes ~/.local/share/weaviate so files left by an earlier run do not carry over.\n",
    "!rm -rf ~/.local/share/weaviate"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e4f866a-f746-4c88-8468-81ed859dfdad",
   "metadata": {},
   "source": [
    "\n",
    "### Step 1 - Download sample data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8d8b55a",
   "metadata": {
    "height": 251
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "\n",
    "# Download the data. The timeout keeps a stalled connection from hanging the\n",
    "# notebook, and raise_for_status() turns an HTTP error into an exception\n",
    "# instead of letting an error page reach the JSON parser.\n",
    "resp = requests.get(\n",
    "    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',\n",
    "    timeout=30,\n",
    ")\n",
    "resp.raise_for_status()\n",
    "data = json.loads(resp.text)  # Load data\n",
    "\n",
    "# Preview the parsed JSON: container type and number of entries\n",
    "print(type(data), len(data))\n",
    "\n",
    "def json_print(data):\n",
    "    \"\"\"Pretty-print `data` as JSON text with a two-space indent.\"\"\"\n",
    "    print(json.dumps(data, indent=2))\n",
    "\n",
    "json_print(data[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c56b4975-bce1-4805-8369-e680d28cf9a9",
   "metadata": {},
   "source": [
    "### Step 2 - Create an embedded instance of Weaviate vector database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1504f6fc-c41a-4c53-966a-4bdb0ac02dcc",
   "metadata": {
    "height": 285
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import openai\n",
    "import weaviate\n",
    "from dotenv import find_dotenv, load_dotenv\n",
    "from weaviate import EmbeddedOptions\n",
    "\n",
    "# Read the local .env file so the variables below are available\n",
    "_ = load_dotenv(find_dotenv())\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']\n",
    "\n",
    "# Extra headers handed to the Weaviate client. Both values come from the\n",
    "# environment, so no credential is written into the notebook.\n",
    "openai_headers = {\n",
    "    \"X-OpenAI-BaseURL\": os.environ['OPENAI_API_BASE'],\n",
    "    \"X-OpenAI-Api-Key\": openai.api_key,\n",
    "}\n",
    "\n",
    "client = weaviate.Client(\n",
    "    embedded_options=EmbeddedOptions(),\n",
    "    additional_headers=openai_headers,\n",
    ")\n",
    "print(f\"Client created? {client.is_ready()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42f88930-693d-4041-a93c-280f00d8a75d",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Pretty-print whatever client.get_meta() returns for the instance created above\n",
    "json_print(client.get_meta())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7eb24c9-1f47-402f-ae21-58cad28c796f",
   "metadata": {},
   "source": [
    "### Step 3 - Create Question collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "654b58e4-ad12-461c-951b-bf5ed372ee82",
   "metadata": {
    "height": 302
   },
   "outputs": [],
   "source": [
    "# Start from a clean schema. CAUTION: an existing \"Question\" collection is deleted.\n",
    "if client.schema.exists(\"Question\"):\n",
    "    client.schema.delete_class(\"Question\")\n",
    "\n",
    "# Settings for the text2vec-openai module used by this collection\n",
    "openai_vectorizer_config = {\n",
    "    \"model\": \"ada\",\n",
    "    \"modelVersion\": \"002\",\n",
    "    \"type\": \"text\",\n",
    "    \"baseURL\": os.environ[\"OPENAI_API_BASE\"],\n",
    "}\n",
    "\n",
    "class_obj = {\n",
    "    \"class\": \"Question\",\n",
    "    \"vectorizer\": \"text2vec-openai\",  # Use OpenAI as the vectorizer\n",
    "    \"moduleConfig\": {\"text2vec-openai\": openai_vectorizer_config},\n",
    "}\n",
    "\n",
    "client.schema.create_class(class_obj)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b35a3a84-b944-4434-86eb-cbf0b7d6af12",
   "metadata": {},
   "source": [
    "### Step 4 - Load sample data and generate vector embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30c7d265-4380-4cb5-a494-bd759bbf7297",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "# Reminder of the data structure: print the first downloaded entry again\n",
    "json_print(data[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95e3d133-5b40-4b46-8719-d2cc70fffd1b",
   "metadata": {
    "height": 268
   },
   "outputs": [],
   "source": [
    "with client.batch.configure(batch_size=5) as batch:\n",
    "    # Queue every downloaded entry; the 1-based counter is only used for the progress message\n",
    "    for position, record in enumerate(data, start=1):\n",
    "\n",
    "        print(f\"importing question: {position}\")\n",
    "\n",
    "        batch.add_data_object(\n",
    "            data_object={\n",
    "                \"answer\": record[\"Answer\"],\n",
    "                \"question\": record[\"Question\"],\n",
    "                \"category\": record[\"Category\"],\n",
    "            },\n",
    "            class_name=\"Question\",\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25dc32a2-1629-4149-97a9-1d13d49f132d",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "# Aggregate over the Question collection, requesting the meta count\n",
    "count = (\n",
    "    client.query\n",
    "    .aggregate(\"Question\")\n",
    "    .with_meta_count()\n",
    "    .do()\n",
    ")\n",
    "json_print(count)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e128da9-8b9c-4e93-8077-ea35c2f27d4e",
   "metadata": {},
   "source": [
    "## Let's extract the vector that represents a question!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8af97b0b-8e6b-4816-95ac-a31f9e2d80ab",
   "metadata": {
    "height": 149
   },
   "outputs": [],
   "source": [
    "# Fetch a single Question and ask for its vector alongside the properties\n",
    "result = (\n",
    "    client.query\n",
    "    .get(\"Question\", [\"category\", \"question\", \"answer\"])\n",
    "    .with_additional(\"vector\")\n",
    "    .with_limit(1)\n",
    "    .do()\n",
    ")\n",
    "\n",
    "json_print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5153263-4464-4e45-8da8-fd47ec95fb4e",
   "metadata": {},
   "source": [
    "## Query time\n",
    "What is the distance between the query `biology` and the returned objects?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42e3d49a-5033-4742-8acf-6427980b9f54",
   "metadata": {
    "height": 183
   },
   "outputs": [],
   "source": [
    "# Near-text search for \"biology\", limited to 2 hits, requesting each hit's distance.\n",
    "# `concepts` and the additional properties are passed as lists, matching the\n",
    "# other queries in this notebook.\n",
    "response = (\n",
    "    client.query\n",
    "    .get(\"Question\", [\"question\", \"answer\", \"category\"])\n",
    "    .with_near_text({\"concepts\": [\"biology\"]})\n",
    "    .with_additional([\"distance\"])\n",
    "    .with_limit(2)\n",
    "    .do()\n",
    ")\n",
    "\n",
    "json_print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "691aec17-32f5-4df3-a066-5017be268a1f",
   "metadata": {
    "height": 183
   },
   "outputs": [],
   "source": [
    "# Near-text search for \"animals\", limited to 10 hits, with distances\n",
    "animals_search = {\"concepts\": [\"animals\"]}\n",
    "\n",
    "response = (\n",
    "    client.query\n",
    "    .get(\"Question\", [\"question\", \"answer\"])\n",
    "    .with_near_text(animals_search)\n",
    "    .with_limit(10)\n",
    "    .with_additional([\"distance\"])\n",
    "    .do()\n",
    ")\n",
    "\n",
    "json_print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "806d0b48-e03a-42ff-b3c7-ff640bb4bccb",
   "metadata": {},
   "source": [
    "## We can tell the vector database to drop results beyond a distance threshold!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afe873c5-0ca3-470b-a40d-9a44361fddfb",
   "metadata": {
    "height": 183
   },
   "outputs": [],
   "source": [
    "MAX_DISTANCE = 0.24  # distance threshold sent with the near-text search\n",
    "\n",
    "# Same \"animals\" search as before, now with a distance threshold\n",
    "response = (\n",
    "    client.query\n",
    "    .get(\"Question\", [\"question\", \"answer\"])\n",
    "    .with_near_text({\"concepts\": [\"animals\"], \"distance\": MAX_DISTANCE})\n",
    "    .with_limit(10)\n",
    "    .with_additional([\"distance\"])\n",
    "    .do()\n",
    ")\n",
    "\n",
    "json_print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9875f5a5-f5f1-4642-bd02-28d5a66a90c0",
   "metadata": {},
   "source": [
    "## Vector database support for CRUD operations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0e4b598-1cb7-4a3b-a230-5c953a43ead1",
   "metadata": {},
   "source": [
    "### Create"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e068287-69df-4792-98de-1d4a273381b7",
   "metadata": {
    "height": 166
   },
   "outputs": [],
   "source": [
    "# Create one new Question object; the value returned by create() is kept in\n",
    "# object_uuid for the read, update and delete cells\n",
    "new_question = {\n",
    "    'question': \"Leonardo da Vinci was born in this country.\",\n",
    "    'answer': \"Italy\",\n",
    "    'category': \"Culture\",\n",
    "}\n",
    "\n",
    "object_uuid = client.data_object.create(\n",
    "    data_object=new_question,\n",
    "    class_name=\"Question\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a769490e-b6ac-41b2-98dd-ae641692331e",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Show the value returned by client.data_object.create()\n",
    "print(object_uuid)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e98cd37f-4e4c-46fb-b0cd-ec0813de30d3",
   "metadata": {},
   "source": [
    "### Read"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fbc4b12-91d0-40d8-86ba-12cfc24a55a6",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "# Read the object back using the id stored in object_uuid\n",
    "data_object = client.data_object.get_by_id(\n",
    "    object_uuid,\n",
    "    class_name=\"Question\",\n",
    ")\n",
    "json_print(data_object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91a962a6-0406-4750-9fb4-c7520936c864",
   "metadata": {
    "height": 132
   },
   "outputs": [],
   "source": [
    "# Same lookup as before, this time passing with_vector=True\n",
    "lookup_options = {\"class_name\": \"Question\", \"with_vector\": True}\n",
    "data_object = client.data_object.get_by_id(object_uuid, **lookup_options)\n",
    "\n",
    "json_print(data_object)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba0b2cf0-5cc8-4a24-8685-6791f1bdf2c3",
   "metadata": {},
   "source": [
    "### Update"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9daf260-e0e0-489f-83af-05b762e01171",
   "metadata": {
    "height": 115
   },
   "outputs": [],
   "source": [
    "# Send a new value for the 'answer' property of the object created earlier\n",
    "updated_fields = {'answer': \"Florence, Italy\"}\n",
    "\n",
    "client.data_object.update(\n",
    "    uuid=object_uuid,\n",
    "    class_name=\"Question\",\n",
    "    data_object=updated_fields,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a743d481-1a25-47b6-82ac-c3c3c3b02b8d",
   "metadata": {
    "height": 115
   },
   "outputs": [],
   "source": [
    "# Fetch the object again so the result of the update call can be inspected\n",
    "data_object = client.data_object.get_by_id(object_uuid, class_name='Question')\n",
    "json_print(data_object)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c317c5a-6fc9-47f3-b8c3-d2aff2ab6cb5",
   "metadata": {},
   "source": [
    "### Delete"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9ddc0e3-8d18-4908-93be-626ddf973b44",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Meta count of the Question collection before the delete call\n",
    "json_print(\n",
    "    client.query\n",
    "    .aggregate(\"Question\")\n",
    "    .with_meta_count()\n",
    "    .do()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64118e7a-2b9d-425f-a922-54b98103a9b6",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Delete the object whose id is stored in object_uuid\n",
    "client.data_object.delete(\n",
    "    uuid=object_uuid,\n",
    "    class_name=\"Question\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53fa8f44-ec03-425a-a567-ebb3825b7cf0",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "# Meta count of the Question collection again, after the delete call\n",
    "json_print(\n",
    "    client.query\n",
    "    .aggregate(\"Question\")\n",
    "    .with_meta_count()\n",
    "    .do()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "676f1722-9690-43a0-b256-c59d678570ad",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af0d7d1e-3299-4db6-b936-126f98b79ddf",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c1a711c-941d-47b4-8d3d-eec81ee673b2",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "354c46c5-0f6e-44dc-8fd7-2cb56036efaf",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d43bcc4b-08b7-4999-89ce-6dd9e23874c2",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "380a4575-92ad-4df4-b4ec-49f7ef2152c3",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c9bf4e6-8f57-473c-a314-9fc850c50c91",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "503a99f6-a380-4e72-ae8b-a99951517cbf",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbac55fe-9816-4f6e-b640-d259fc0f5f54",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1564356-b49c-42a5-8e45-0e82cda03fe4",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44d1a9e2-bcf0-4fc2-9a59-8a6f3128becb",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e766822-155c-42a3-a291-30af0a6ffabd",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f57c10f-8805-41ef-83c1-30d37f2fbf76",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26cbe50e-1de4-438f-a4fb-848bf211688e",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11910ff5-f365-4b58-82ce-8831851b9334",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5484cbbf-fc73-4f9a-9075-1714de82e0eb",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d85e7588-1ab3-4fd8-8541-67eb9c20c854",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4f4e1f2-0125-49c2-a0d5-145a13997ca2",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdac0477-2358-4c0f-a37c-29661cd259bf",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c56dbddf-307f-4ac2-b96e-2539641fe935",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "420996d3-db78-434e-8b2c-0c1a1003d967",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "766bf4b7-8151-497f-ba29-247ec46f890d",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a093aaf-6bb0-453c-90b7-5135fe3ecbbb",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4b4ef7e-5f86-44bb-99cc-2068c654e7eb",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40233c35-1ec8-4b73-a46b-65c79628edf0",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6d9b865-8f1b-4f7b-a16b-f75e56b3ae6f",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e31c7fa-eabb-47c3-a8be-784e96c6a31c",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "018b03f2-684a-4368-9932-e6e32c4a1992",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1edeadd9-8ffb-4834-ac30-63e4967b810c",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4508bf16-42ed-43fa-a940-e643e58bdcac",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d8eb5b6-c844-41a6-ad1a-ee89cc5331d3",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "570e59af-d194-4b00-803a-92e2580df8d0",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa7721a0-4975-4629-a2df-8b8d6a3bcae9",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bfa03c7-767b-4325-885a-ecdbd4201b22",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c887d241-e164-43ee-b224-d08c87bb6578",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75904353-4870-4b49-9142-a85a0fa482a5",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ea5f57f-b545-4c0b-a599-74a24b2946a6",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa7c0947-a958-4d47-8e35-b0b5cb6c9f45",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eac84802-54d9-4224-843b-e136e9f33bf7",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0d2f56d-db80-4f70-85f5-59043e3806cb",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "939ae1a7-5730-4131-9e52-acd25c96ac71",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c5abb1a-19f2-44c7-9b18-712dd6aa2a66",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
